{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huseinhouse-storage.s3-ap-southeast-1.amazonaws.com/bert-bahasa/singlish.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "19870767"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Decode explicitly as UTF-8 so the result does not depend on the machine's locale.\n",
    "# NOTE(review): assumes singlish.txt is UTF-8 encoded - confirm against the dump.\n",
    "with open('singlish.txt', encoding='utf-8') as fopen:\n",
    "    # One list entry per line; a trailing newline in the file yields a final ''.\n",
    "    singlish = fopen.read().split('\\n')\n",
    "\n",
    "len(singlish)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from tqdm import tqdm\n",
    "import cleaning\n",
    "\n",
    "def preprocessing(string):\n",
    "    string = re.sub(\n",
    "        'http\\S+|www.\\S+',\n",
    "        '',\n",
    "        ' '.join(\n",
    "            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]\n",
    "        ),\n",
    "    )\n",
    "    \n",
    "    chars = ',.()!:\\'\"/;=-'\n",
    "    for c in chars:\n",
    "        string = string.replace(c, f' {c} ')\n",
    "        \n",
    "    string = re.sub(\n",
    "        u'[0-9!@#$%^&*()_\\-+{}|\\~`\\'\";:?/.>,<]',\n",
    "        ' ',\n",
    "        string,\n",
    "        flags = re.UNICODE,\n",
    "    )\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    \n",
    "    return string.lower()\n",
    "\n",
    "def loop(strings):\n",
    "    \"\"\"\n",
    "    Apply `preprocessing` to every element of `strings`, in place.\n",
    "\n",
    "    Each position is overwritten with its cleaned value while a tqdm bar\n",
    "    reports progress; the same (now mutated) object is returned.\n",
    "    \"\"\"\n",
    "    positions = range(len(strings))\n",
    "    for position in tqdm(positions):\n",
    "        cleaned = preprocessing(strings[position])\n",
    "        strings[position] = cleaned\n",
    "    return strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 764260/764260 [00:12<00:00, 61785.60it/s]\n",
      "100%|██████████| 764260/764260 [00:14<00:00, 52159.97it/s]\n",
      "100%|██████████| 764260/764260 [00:16<00:00, 46731.43it/s]\n",
      "100%|██████████| 764260/764260 [00:15<00:00, 50152.13it/s]\n",
      " 93%|█████████▎| 710055/764260 [00:14<00:01, 29858.39it/s]\n",
      "100%|██████████| 764260/764260 [00:15<00:00, 48047.03it/s]\n",
      "100%|██████████| 764260/764260 [00:16<00:00, 46528.10it/s]\n",
      "100%|██████████| 764260/764260 [00:18<00:00, 41971.88it/s]\n",
      "100%|██████████| 764260/764260 [00:16<00:00, 46111.23it/s]\n",
      "100%|██████████| 764260/764260 [00:16<00:00, 45560.89it/s]\n",
      "100%|██████████| 764260/764260 [00:16<00:00, 45848.13it/s]\n",
      " 67%|██████▋   | 512286/764260 [00:11<00:04, 59188.06it/s]\n",
      "100%|██████████| 7/7 [00:06<00:00,  1.03it/s]58953.11it/s]\n",
      "100%|██████████| 764260/764260 [00:16<00:00, 46383.69it/s]\n",
      "100%|██████████| 764260/764260 [00:16<00:00, 45168.64it/s]\n",
      "100%|██████████| 764260/764260 [00:16<00:00, 45444.94it/s]\n",
      "100%|██████████| 764260/764260 [00:17<00:00, 42897.90it/s]\n",
      "100%|██████████| 764260/764260 [00:15<00:00, 49757.29it/s]\n",
      "100%|██████████| 764260/764260 [00:16<00:00, 46439.32it/s]\n",
      "100%|██████████| 764260/764260 [00:17<00:00, 44623.79it/s]\n",
      "100%|██████████| 764260/764260 [00:16<00:00, 46068.07it/s]\n",
      "100%|██████████| 764260/764260 [00:15<00:00, 48206.16it/s]\n",
      "100%|██████████| 764260/764260 [00:14<00:00, 51587.59it/s]\n",
      "100%|██████████| 764260/764260 [00:16<00:00, 47108.30it/s]\n",
      "100%|██████████| 764260/764260 [00:18<00:00, 41325.01it/s]\n",
      "100%|██████████| 764260/764260 [00:15<00:00, 49281.21it/s]\n",
      "100%|██████████| 764260/764260 [00:18<00:00, 42117.68it/s]\n"
     ]
    }
   ],
   "source": [
    "# Run `loop` over `singlish` through the project-local `cleaning.multiprocessing` helper.\n",
    "# NOTE(review): cores=26 is hard-coded - presumably sized for the original machine, confirm.\n",
    "singlish = cleaning.multiprocessing(singlish, loop, cores=26)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://guidesify.com/blog/2017/08/13/singlish-phrases-define-singapore/\n",
    "\n",
    "manglish_vocab = {\n",
    "    'siasui', 'lah', 'chun', 'kapster', 'leh', 'lansi', 'lan si', 'meh',\n",
    "    'stim', 'maluation', 'kantoi', 'seh', 'yam', 'hor', 'la', 'cha',\n",
    "    'tao', 'amoi', 'aiya', 'angmor', 'angpau', 'beng', 'chow', 'batam',\n",
    "    'liao', 'nian', 'buiji', 'hou', 'guo', 'jiang', 'chiu', 'buji',\n",
    "    'hao', 'kam', 'wan', 'yao', 'cao', 'ciao', 'jin', 'hoseh',\n",
    "    'jiak', 'ying', 'leybit', 'sibei', 'laobu', 'sia', 'cilok',\n",
    "    'cibai', 'cb', 'entao', 'gwai', 'kai', 'kongmong', 'kapcai',\n",
    "    'lanjiao', 'lancau', 'lalazai', 'momantai', 'paikia', 'paiseh',\n",
    "    'pokai', 'seow', 'sohai', 'sueh', 'tapau', 'wor', 'hor',\n",
    "    'terrer', 'chop', 'lansi', 'paiseh', 'syok', 'shiok',\n",
    "    'sibeh', 'kawkaw', 'abuden', 'mah', 'lor', 'paiseh',\n",
    "    'niang', 'aiya', 'kena', 'aiyo', 'moh', 'bojio',\n",
    "    'buay', 'kia', 'chao', 'chim', 'cheem', 'chiong',\n",
    "    'chiobu', 'dabao', 'kiang', 'hosei', 'hoseh',\n",
    "    'jialat', 'kaypoh', 'kenasai', 'liddat', 'machiam',\n",
    "    'nehmind', 'pokkai', 'shiok', 'siol', 'smlj', 'suan',\n",
    "    'suay', 'swaku', 'swee', 'tyco', 'wapiang', 'walao', 'zhun',\n",
    "    'walau', 'ngaidi', 'ngaidiao', 'jilo', 'kampung',\n",
    "    'kaobu', 'siao', 'obiang', 'orredy', 'oredy',\n",
    "    'pantang', 'pokai', 'siam', 'tangi', 'talk cock', 'tombalek',\n",
    "    'womit', 'yandao'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 19870767/19870767 [00:46<00:00, 428617.22it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "# A vocabulary entry containing a space (e.g. 'talk cock') can never equal a\n",
    "# single whitespace-split token, so phrases are matched separately against\n",
    "# the space-normalised sentence instead of being silently ignored.\n",
    "vocab_words = {entry for entry in manglish_vocab if ' ' not in entry}\n",
    "vocab_phrases = [\n",
    "    ' ' + ' '.join(entry.split()) + ' '\n",
    "    for entry in manglish_vocab\n",
    "    if ' ' in entry\n",
    "]\n",
    "\n",
    "manglish = []\n",
    "for s in tqdm(singlish):\n",
    "    tokens = s.split()\n",
    "    # isdisjoint stops at the first hit and avoids building a set per sentence.\n",
    "    matched = not vocab_words.isdisjoint(tokens)\n",
    "    if not matched and vocab_phrases:\n",
    "        # Pad both ends so a phrase only matches on whole-token boundaries.\n",
    "        padded = ' ' + ' '.join(tokens) + ' '\n",
    "        matched = any(phrase in padded for phrase in vocab_phrases)\n",
    "    if matched:\n",
    "        manglish.append(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(908120, 19870767)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of sentences kept by the vocabulary filter vs. total sentences scanned.\n",
    "len(manglish), len(singlish)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Write the selected sentences to disk as one JSON array.\n",
    "with open('manglish.json', 'w') as json_file:\n",
    "    json.dump(manglish, json_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import boto3\n",
    "\n",
    "# bucketName = 'malaya-dataset'\n",
    "\n",
    "# outPutname = f\"dumping/twitter/manglish.json\"\n",
    "# s3 = boto3.client('s3')\n",
    "# s3.upload_file('manglish.json',bucketName,outPutname)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
